library(tidyverse)
library(plotly)
library(ggplot2)
library(knitr)
library(readr)
library(dplyr)
library(tidyr)
library(scales)
• N = Not Yet Graded• A = Grade A• B = Grade B• C = Grade C• Z = Grade Pending• P= Grade Pending issued on re-opening following an initial inspection that resulted in a closure
Indicator of critical violation; “• Critical • Not Critical • Not Applicable”; Critical violations are those most likely to contribute to food-borne illness
# The same data cleaning process in demographics part, also only consider data with valid violation_description and critical_flag, and grade A, B, C.
manhattan_data = read_csv("Manhattan_Restaurant_Inspection_Results.csv", na = c("NA", "", "."))
str (manhattan_data)
## spc_tbl_ [94,616 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ CAMIS : num [1:94616] 50140436 50158081 50152703 50160975 50161087 ...
## $ DBA : chr [1:94616] "JUST SALAD" "THE MANNER" "MIDNIGHT BLUE" "BLUE BLOSSOM" ...
## $ BORO : chr [1:94616] "Manhattan" "Manhattan" "Manhattan" "Manhattan" ...
## $ BUILDING : chr [1:94616] "2853" "58" "106" "108" ...
## $ STREET : chr [1:94616] "BROADWAY" "THOMPSON STREET" "EAST 19 STREET" "WEST 39 STREET" ...
## $ ZIPCODE : num [1:94616] 10025 10012 10003 10018 10025 ...
## $ PHONE : num [1:94616] 7.32e+09 9.17e+09 3.48e+09 6.47e+09 9.29e+09 ...
## $ CUISINE DESCRIPTION : chr [1:94616] NA NA NA NA ...
## $ INSPECTION DATE : chr [1:94616] "01/01/1900" "01/01/1900" "01/01/1900" "01/01/1900" ...
## $ ACTION : chr [1:94616] NA NA NA NA ...
## $ VIOLATION CODE : chr [1:94616] NA NA NA NA ...
## $ VIOLATION DESCRIPTION: chr [1:94616] NA NA NA NA ...
## $ CRITICAL FLAG : chr [1:94616] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
## $ SCORE : num [1:94616] NA NA NA NA NA NA NA NA NA NA ...
## $ GRADE : chr [1:94616] NA NA NA NA ...
## $ GRADE DATE : chr [1:94616] NA NA NA NA ...
## $ RECORD DATE : chr [1:94616] "11/05/2024" "11/05/2024" "11/05/2024" "11/05/2024" ...
## $ INSPECTION TYPE : chr [1:94616] NA NA NA NA ...
## $ Latitude : num [1:94616] 40.8 40.7 40.7 40.8 40.8 ...
## $ Longitude : num [1:94616] -74 -74 -74 -74 -74 ...
## $ Community Board : num [1:94616] 109 102 105 105 107 108 101 105 104 102 ...
## $ Council District : chr [1:94616] "07" "01" "02" "04" ...
## $ Census Tract : chr [1:94616] "019900" "004700" "005000" "011300" ...
## $ BIN : num [1:94616] 1075440 1087362 1017905 1015273 1055676 ...
## $ BBL : num [1:94616] 1.02e+09 1.00e+09 1.01e+09 1.01e+09 1.02e+09 ...
## $ NTA : chr [1:94616] "MN09" "MN24" "MN21" "MN17" ...
## $ Location Point1 : logi [1:94616] NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. CAMIS = col_double(),
## .. DBA = col_character(),
## .. BORO = col_character(),
## .. BUILDING = col_character(),
## .. STREET = col_character(),
## .. ZIPCODE = col_double(),
## .. PHONE = col_double(),
## .. `CUISINE DESCRIPTION` = col_character(),
## .. `INSPECTION DATE` = col_character(),
## .. ACTION = col_character(),
## .. `VIOLATION CODE` = col_character(),
## .. `VIOLATION DESCRIPTION` = col_character(),
## .. `CRITICAL FLAG` = col_character(),
## .. SCORE = col_double(),
## .. GRADE = col_character(),
## .. `GRADE DATE` = col_character(),
## .. `RECORD DATE` = col_character(),
## .. `INSPECTION TYPE` = col_character(),
## .. Latitude = col_double(),
## .. Longitude = col_double(),
## .. `Community Board` = col_double(),
## .. `Council District` = col_character(),
## .. `Census Tract` = col_character(),
## .. BIN = col_double(),
## .. BBL = col_double(),
## .. NTA = col_character(),
## .. `Location Point1` = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
cleaned_data = manhattan_data %>%
janitor::clean_names() %>%
filter(
!is.na(dba),
!is.na(cuisine_description),
!is.na(grade),
!is.na(score),
!is.na(zipcode),
!is.na(violation_description),
!is.na(critical_flag),
grade %in% c("A", "B", "C")
) %>% mutate(region = case_when(
zipcode >= 10000 & zipcode <= 10025 ~ "Downtown",
zipcode >= 10026 & zipcode <= 10040 ~ "Midtown",
zipcode >= 10041 & zipcode <= 10282 ~ "Uptown",
TRUE ~ "Other" # For ZIP codes like 11371, 12345, etc.
))
# Analyze the effect of violation_description on grade
# Group by critical flag and grade to calculate counts
violation_grade_summary <- cleaned_data %>%
group_by(critical_flag, grade) %>%
summarize(count = n(), .groups = "drop")
# Calculate average scores by CRITICAL_FLAG and grade
average_score <- cleaned_data %>%
group_by(critical_flag, grade) %>%
summarize(avg_score = mean(score), .groups = "drop")
# Create an EDA using plotly
violation_grade = violation_grade_summary %>%
plot_ly(
x = ~grade,
y = ~count,
type = 'bar',
color = ~critical_flag,
colors = "viridis") %>%
layout(
title = "Effect of Violation Type on Restaurant Grade",
xaxis = list(title = "Grade"),
yaxis = list(title = "Restaurant Number")
)
violation_score = average_score %>%
plot_ly(
x = ~grade,
y = ~avg_score,
type = 'bar',
color = ~critical_flag,
colors = "viridis") %>%
layout(
title = "Effect of Violation Type on Restaurant Score",
xaxis = list(title = "Grade"),
yaxis = list(title = "Restaurant Average Score")
)
Chi-Square Test between critical_flag and grade: conclusion: p-value<0.05, reject null, they are significant related.
contingency_table <- table(cleaned_data$critical_flag, cleaned_data$grade)
chi_test <- chisq.test(contingency_table)
print(chi_test)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 821.92, df = 2, p-value < 2.2e-16
# Group by cuisine_description and calculate total and critical counts
violation_cuisine <- cleaned_data %>%
mutate(critical_flag = ifelse(critical_flag == "Critical", 1, 0)) %>%
group_by(cuisine_description) %>%
summarize(total_violations = n(),
critical_violations = sum(critical_flag, na.rm = TRUE),
critical_percent = (critical_violations / total_violations),
critical_percentage = percent(critical_percent, accuracy = 0.01)
) %>%
arrange(desc(critical_percentage)) %>% # Sort by percentage of critical violations
head(10) # View top 10 restaurant types with the highest percentage of critical violations
plot_ly(
type = 'table',
header = list(
values = c("Cuisine Description", "Critical Percentage"),
align = c("center", "center"),
font = list(size = 14, color = "white"),
fill = list(color = "purple")
),
cells = list(
values = rbind(violation_cuisine$cuisine_description, violation_cuisine$critical_percentage),
align = c("center", "center"),
font = list(size = 12),
fill = list(color = c("white", "white"))
)
)
what type of violations are most comment in Chinese/Japanese cuisine:
chinese_japanese <- cleaned_data %>%
filter(cuisine_description == "Chinese/Japanese") %>% # 42 rows in total
group_by(violation_description) %>%
summarize(num_violation = n(),
violation_percent = num_violation / 42,
violation_percentage = percent(violation_percent, accuracy = 0.01)) %>%
arrange(desc(violation_percent))
what type of violations are most comment in critical violation: